## R Script Outputs ------------------------------------------------------------
# Appendix Table D.9: DiD Estimation with Matching: 2016--2017
# Appendix Table D.10: Improved Covariate Balance
# Appendix Figure D.2: Frequency Distribution of the Number of Matched Control Firms.


## Instructions ----------------------------------------------------------------
# Step 1: Adjust MAIN_DIR to where README.txt is located
# Step 2: Run entire script


## IMPORTANT NOTE --------------------------------------------------------------
# This script relies on Orbis' proprietary data on firms' industry, size, public 
# listing status, sales, and employment. To protect Orbis' proprietary data, I 
# exclude all Orbis data and only load resulting anonymous saved outputs for 
# replication purposes. All code for preparing the input data and implementing 
# PanelMatch are documented below.


## setup -----------------------------------------------------------------------
# clean slate
rm(list = ls())
date()

# load packages
pkg <- c("tidyverse",
         "stargazer",
         "devtools",
         "xtable",
         "gridExtra")

lapply(pkg, require, character.only = TRUE)

# install developer's version of PanelMatch 2.0.1 from Github (June 5, 2022), which 
# corrects and improves uncertainty estimation, according to the package authors
install_github("insongkim/PanelMatch", dependencies = TRUE, ref = "se_comparison")

# load library
library(PanelMatch)

# set main directory
MAIN_DIR <- "~/Dropbox/Research/JOP-h1b-replication"


## Prepare data for PanelMatch -------------------------------------------------
## NOTE: The code below show exactly how I constructed the input dataset for 
## PanelMatch using the main dataset. To protect Orbis' proprietary data, I
## am prohibited from uploading the entire data set. However, data-pm-08-17-orbis-excluded.RData 
## in the replication files shows what the input data looks like without Orbis 
## variables.
# 
# # load main dataset 
# load(file = paste(MAIN_DIR, "data-merge-91-17.RData", sep = "/"))
# 
# # make size a factor
# df <- data.merge %>%
#   mutate(size = factor(size, levels = c("Small", "Medium", "Large", "Very Large")))
# 
# # subset years
# df.sub.08.17 <- df %>%
#   filter(year >= 2008 & year <= 2017)
# 
# # check number of unique bvd_ids
# n_distinct(df.sub.08.17$bvd_id)
# 
# # create placeholder for each firm-year; fill in any missing rows between years when firms first occurred and last existed
# df.ph.08.17 <- df.sub.08.17 %>%
#   select(bvd_id, year) %>%
#   group_by(bvd_id) %>% 
#   complete(year = full_seq(year, 1)) %>%
#   ungroup()
# 
# # merge and create variables
# df.pm.08.17 <- left_join(df.ph.08.17, df.sub.08.17, by = c("bvd_id", "year")) %>% 
#   group_by(bvd_id) %>% 
#   mutate(bvd_id_int = cur_group_id()) %>% # PM ids need to be integers
#   ungroup() %>%
#   select(bvd_id, 
#          bvd_id_int,
#          bvd_name,
#          year, 
#          post_2017,
#          h1b_deny_rate,
#          lob_img_2017, 
#          lob_text_hs_h1b_visa_2017, 
#          lob_text_hs_h1b_visa_wh_eop_2017,
#          lob_text_hs_h1b_visa_uscis_2017, 
#          lob_text_hs_h1b_visa_dhs_2017,
#          lob_text_hs_h1b_visa_uscis_dhs_2017, 
#          lob_text_hs_h1b_visa_dol_2017,
#          lob_text_hs_h1b_visa_only_congress_2017,
#          naics_core,
#          size,
#          public, 
#          n_subsidiary,
#          sales, 
#          n_employ) %>%
#   mutate(treat = ifelse(lob_img_2017 == 1 & post_2017 == 1, 1, 0),
#          treat = ifelse(is.na(treat), 0, treat),
#          
#          treat_text = ifelse(lob_text_hs_h1b_visa_2017 == 1 & post_2017 == 1, 1, 0),
#          treat_text = ifelse(is.na(treat_text), 0, treat_text),
#          
#          treat_text_uscis = ifelse(lob_text_hs_h1b_visa_uscis_2017 == 1 & post_2017 == 1, 1, 0),
#          treat_text_uscis = ifelse(is.na(treat_text_uscis), 0, treat_text_uscis),
#          
#          treat_text_dol = ifelse(lob_text_hs_h1b_visa_dol_2017 == 1 & post_2017 == 1, 1, 0),
#          treat_text_dol = ifelse(is.na(treat_text_dol), 0, treat_text_dol),
#          
#          treat_text_dhs = ifelse(lob_text_hs_h1b_visa_dhs_2017 == 1 & post_2017 == 1, 1, 0),
#          treat_text_dhs = ifelse(is.na(treat_text_dhs), 0, treat_text_dhs),
#          
#          treat_text_wh_eop = ifelse(lob_text_hs_h1b_visa_wh_eop_2017 == 1 & post_2017 == 1, 1, 0),
#          treat_text_wh_eop = ifelse(is.na(treat_text_wh_eop), 0, treat_text_wh_eop),
#          
#          treat_text_only_congress = ifelse(lob_text_hs_h1b_visa_only_congress_2017 == 1 & post_2017 == 1, 1, 0),
#          treat_text_only_congress = ifelse(is.na(treat_text_only_congress), 0, treat_text_only_congress),
#          
#          year = as.integer(year), # numeric values required by PM
#          
#          naics_core_int = as.numeric(naics_core), 
#          naics_2d = str_sub(naics_core, 1, 2),
#          naics_2d = ifelse(naics_2d == 31 | naics_2d == 32 | naics_2d == 33, "31-33", naics_2d),
#          naics_2d = ifelse(naics_2d == 44 | naics_2d == 45, "44-45", naics_2d),
#          naics_2d = ifelse(naics_2d == 48 | naics_2d == 49, "48-49", naics_2d),
#          naics_2d_factor = as.factor(naics_2d),
#          naics_2d_factor = as.numeric(naics_2d_factor),
#          size_int = as.numeric(size)) %>% 
#   group_by(bvd_id) %>%
#   fill(public, .direction = "updown") %>%
#   fill(naics_core, .direction = "updown") %>%
#   fill(naics_core_int, .direction = "updown") %>%
#   fill(naics_2d, .direction = "updown") %>%
#   fill(naics_2d_factor, .direction = "updown") %>%
#   fill(size, .direction = "updown") %>%
#   fill(size_int, .direction = "updown") %>%
#   ungroup() %>%
#   select(bvd_id, 
#          bvd_id_int,
#          bvd_name,
#          year, 
#          h1b_deny_rate,
#          treat, 
#          treat_text,
#          treat_text_uscis,
#          treat_text_dol,
#          treat_text_dhs,
#          treat_text_wh_eop,
#          treat_text_only_congress,
#          public, 
#          naics_core,
#          naics_core_int,
#          naics_2d,
#          naics_2d_factor,
#          size,
#          size_int,
#          n_subsidiary,
#          sales, 
#          n_employ) 
# 
# # PM requires inputs as data.frame
# df.pm.08.17 <- as.data.frame(df.pm.08.17)
# 
# # save dataset excluding Orbis data (publicly shared)
# df.pm.08.17.orbis.excluded <- df.pm.08.17 %>%
#   select(-public,
#          -naics_core,
#          -naics_core_int,
#          -naics_2d,
#          -naics_2d_factor,
#          -size,
#          -size_int,
#          -n_subsidiary,
#          -sales,
#          -n_employ)
# 
# # save data
# save(df.pm.08.17, file = paste(MAIN_DIR, "data-pm-08-17.RData", sep = "/"))
# save(df.pm.08.17.orbis.excluded, file = paste(MAIN_DIR, "data-pm-08-17-orbis-excluded.RData", sep = "/"))



## implement PanelMatch --------------------------------------------------------
## NOTE: The code below show exactly how I conducted the analysis using 
## PanelMatch and the input data from above.
# 
# # 10 years: 2008 to 2017, lag = 1, without exact match, treat
# # panelmatch
# PM.results.08.17.l1 <- PanelMatch(lag = 1, 
#                                   time.id = "year", 
#                                   unit.id = "bvd_id_int", 
#                                   treatment = "treat", 
#                                   #exact.match.variables = c("naics_2d_factor", "size_int"),
#                                   refinement.method = "none", 
#                                   data = df.pm.08.17, 
#                                   match.missing = TRUE, 
#                                   verbose = TRUE,
#                                   qoi = "att",
#                                   outcome.var = "h1b_deny_rate",
#                                   lead = 0, 
#                                   forbid.treatment.reversal = FALSE)
# 
# # balance
# balance.08.17.l1 <- get_covariate_balance(PM.results.08.17.l1$att,
#                                           data = df.pm.08.17,
#                                           covariates = c("h1b_deny_rate", 
#                                                          "naics_2d_factor", 
#                                                          "size_int", 
#                                                          "public", 
#                                                          "sales", 
#                                                          "n_employ"),
#                                           plot = FALSE,
#                                           ylim = c(-2,2))
# 
# # estimation
# PE.results.08.17.l1 <- PanelEstimate(sets = PM.results.08.17.l1, 
#                                      data = df.pm.08.17)
# summary(PE.results.08.17.l1)
# 
# # save
# save(PM.results.08.17.l1, file = paste(MAIN_DIR, "PM-08-17-l1.RData", sep = "/"))
# save(balance.08.17.l1, file = paste(MAIN_DIR, "balance-08-17-l1.RData", sep = "/"))
# save(PE.results.08.17.l1, file = paste(MAIN_DIR, "PE-08-17-l1.RData", sep = "/"))
# 
# 
# # 10 years: 2008 to 2017, lag = 1, exact match, treat
# # panelmatch
# PM.results.08.17.l1.em <- PanelMatch(lag = 1, 
#                                      time.id = "year", 
#                                      unit.id = "bvd_id_int", 
#                                      treatment = "treat", 
#                                      exact.match.variables = c("naics_2d_factor", "size_int"),
#                                      refinement.method = "none", 
#                                      data = df.pm.08.17, 
#                                      match.missing = TRUE, 
#                                      verbose = TRUE,
#                                      qoi = "att",
#                                      outcome.var = "h1b_deny_rate",
#                                      lead = 0, 
#                                      forbid.treatment.reversal = FALSE)
# 
# # balance
# balance.08.17.l1.em <- get_covariate_balance(PM.results.08.17.l1.em$att,
#                                              data = df.pm.08.17,
#                                              covariates = c("h1b_deny_rate", 
#                                                             "naics_2d_factor", 
#                                                             "size_int", 
#                                                             "public", 
#                                                             "sales", 
#                                                             "n_employ"),
#                                              plot = FALSE,
#                                              ylim = c(-2,2))
# 
# # estimation
# PE.results.08.17.l1.em <- PanelEstimate(sets = PM.results.08.17.l1.em, 
#                                         data = df.pm.08.17)
# summary(PE.results.08.17.l1.em)
# 
# # save 
# save(PM.results.08.17.l1.em, file = paste(MAIN_DIR, "PM-08-17-l1-em.RData", sep = "/"))
# save(balance.08.17.l1.em, file = paste(MAIN_DIR, "balance-08-17-l1-em.RData", sep = "/"))
# save(PE.results.08.17.l1.em, file = paste(MAIN_DIR, "PE-08-17-l1-em.RData", sep = "/"))
# 
# 
# # 10 years: 2008 to 2017, lag = 1, without exact match, treat_text
# # panelmatch
# PM.results.08.17.l1.text <- PanelMatch(lag = 1, 
#                                        time.id = "year", 
#                                        unit.id = "bvd_id_int", 
#                                        treatment = "treat_text", 
#                                        #exact.match.variables = c("naics_2d_factor", "size_int"),
#                                        refinement.method = "none",
#                                        data = df.pm.08.17, 
#                                        match.missing = TRUE, 
#                                        verbose = TRUE,
#                                        qoi = "att",
#                                        outcome.var = "h1b_deny_rate",
#                                        lead = 0, 
#                                        forbid.treatment.reversal = FALSE)
# 
# # balance
# balance.08.17.l1.text <- get_covariate_balance(PM.results.08.17.l1.text$att,
#                                                data = df.pm.08.17,
#                                                covariates = c("h1b_deny_rate", 
#                                                               "naics_2d_factor", 
#                                                               "size_int", 
#                                                               "public", 
#                                                               "sales", 
#                                                               "n_employ"),
#                                                plot = FALSE,
#                                                ylim = c(-2,2))
# 
# # estimation
# PE.results.08.17.l1.text <- PanelEstimate(sets = PM.results.08.17.l1.text, 
#                                           data = df.pm.08.17)
# summary(PE.results.08.17.l1.text)
# 
# # save results 
# save(PM.results.08.17.l1.text, file = paste(MAIN_DIR, "PM-08-17-l1-text.RData", sep = "/"))
# save(balance.08.17.l1.text, file = paste(MAIN_DIR, "balance-08-17-l1-text.RData", sep = "/"))
# save(PE.results.08.17.l1.text, file = paste(MAIN_DIR, "PE-08-17-l1-text.RData", sep = "/"))
# 
# 
# # 10 years: 2008 to 2017, lag = 1, exact match, treat_text
# # panelmatch
# PM.results.08.17.l1.text.em <- PanelMatch(lag = 1, 
#                                           time.id = "year", 
#                                           unit.id = "bvd_id_int", 
#                                           treatment = "treat_text", 
#                                           exact.match.variables = c("naics_2d_factor", "size_int"),
#                                           refinement.method = "none", 
#                                           data = df.pm.08.17, 
#                                           match.missing = TRUE, 
#                                           verbose = TRUE,
#                                           qoi = "att",
#                                           outcome.var = "h1b_deny_rate",
#                                           lead = 0, 
#                                           forbid.treatment.reversal = FALSE)
# 
# # balance
# balance.08.17.l1.text.em <- get_covariate_balance(PM.results.08.17.l1.text.em$att,
#                                                   data = df.pm.08.17,
#                                                   covariates = c("h1b_deny_rate", 
#                                                                  "naics_2d_factor", 
#                                                                  "size_int", 
#                                                                  "public", 
#                                                                  "sales", 
#                                                                  "n_employ"),
#                                                   plot = FALSE,
#                                                   ylim = c(-2,2))
# 
# # estimation
# PE.results.08.17.l1.text.em <- PanelEstimate(sets = PM.results.08.17.l1.text.em, 
#                                              data = df.pm.08.17)
# summary(PE.results.08.17.l1.text.em)
# 
# # save results 
# save(PM.results.08.17.l1.text.em, file = paste(MAIN_DIR, "PM-08-17-l1-text-em.RData", sep = "/"))
# save(balance.08.17.l1.text.em, file = paste(MAIN_DIR, "balance-08-17-l1-text-em.RData", sep = "/"))
# save(PE.results.08.17.l1.text.em, file = paste(MAIN_DIR, "PE-08-17-l1-text-em.RData", sep = "/"))
# 
# 
# # 10 years: 2008 to 2017, lag = 1, without exact match, treat_text_uscis
# # panelmatch
# PM.results.08.17.l1.text.uscis <- PanelMatch(lag = 1, 
#                                              time.id = "year", 
#                                              unit.id = "bvd_id_int", 
#                                              treatment = "treat_text_uscis", 
#                                              #exact.match.variables = c("naics_2d_factor", "size_int"),
#                                              refinement.method = "none", 
#                                              data = df.pm.08.17, 
#                                              match.missing = TRUE, 
#                                              verbose = TRUE,
#                                              qoi = "att",
#                                              outcome.var = "h1b_deny_rate",
#                                              lead = 0, 
#                                              forbid.treatment.reversal = FALSE)
# 
# # balance
# balance.08.17.l1.text.uscis <- get_covariate_balance(PM.results.08.17.l1.text.uscis$att,
#                                                      data = df.pm.08.17,
#                                                      covariates = c("h1b_deny_rate", 
#                                                                     "naics_2d_factor", 
#                                                                     "size_int", 
#                                                                     "public", 
#                                                                     "sales", 
#                                                                     "n_employ"),
#                                                      plot = FALSE,
#                                                      ylim = c(-2,2))
# 
# # estimation
# PE.results.08.17.l1.text.uscis <- PanelEstimate(sets = PM.results.08.17.l1.text.uscis, 
#                                                 data = df.pm.08.17)
# summary(PE.results.08.17.l1.text.uscis)
# 
# # save results 
# save(PM.results.08.17.l1.text.uscis, file = paste(MAIN_DIR, "PM-08-17-l1-text-uscis.RData", sep = "/"))
# save(balance.08.17.l1.text.uscis, file = paste(MAIN_DIR, "balance-08-17-l1-text-uscis.RData", sep = "/"))
# save(PE.results.08.17.l1.text.uscis, file = paste(MAIN_DIR, "PE-08-17-l1-text-uscis.RData", sep = "/"))
# 
# 
# # 10 years: 2008 to 2017, lag = 1, exact match, treat_text_uscis
# # panelmatch
# PM.results.08.17.l1.text.uscis.em <- PanelMatch(lag = 1, 
#                                                 time.id = "year", 
#                                                 unit.id = "bvd_id_int", 
#                                                 treatment = "treat_text_uscis", 
#                                                 exact.match.variables = c("naics_2d_factor", "size_int"),
#                                                 refinement.method = "none", 
#                                                 data = df.pm.08.17, 
#                                                 match.missing = TRUE, 
#                                                 verbose = TRUE,
#                                                 qoi = "att",
#                                                 outcome.var = "h1b_deny_rate",
#                                                 lead = 0, 
#                                                 forbid.treatment.reversal = FALSE)
# 
# # balance
# balance.08.17.l1.text.uscis.em <- get_covariate_balance(PM.results.08.17.l1.text.uscis.em$att,
#                                                         data = df.pm.08.17,
#                                                         covariates = c("h1b_deny_rate", 
#                                                                        "naics_2d_factor", 
#                                                                        "size_int", 
#                                                                        "public", 
#                                                                        #"n_subsidiary", 
#                                                                        "sales", 
#                                                                        "n_employ"
#                                                         ),
#                                                         plot = FALSE,
#                                                         ylim = c(-2,2))
# 
# # estimation
# PE.results.08.17.l1.text.uscis.em <- PanelEstimate(sets = PM.results.08.17.l1.text.uscis.em, 
#                                                    data = df.pm.08.17)
# summary(PE.results.08.17.l1.text.uscis.em)
# 
# # save results 
# save(PM.results.08.17.l1.text.uscis.em, file = paste(MAIN_DIR, "PM-08-17-l1-text-uscis-em.RData", sep = "/"))
# save(balance.08.17.l1.text.uscis.em, file = paste(MAIN_DIR, "balance-08-17-l1-text-uscis-em.RData", sep = "/"))
# save(PE.results.08.17.l1.text.uscis.em, file = paste(MAIN_DIR, "PE-08-17-l1-text-uscis-em.RData", sep = "/"))
# 
# 
# # 10 years: 2008 to 2017, lag = 1, without exact match, treat_text_dol
# # panelmatch
# PM.results.08.17.l1.text.dol <- PanelMatch(lag = 1, 
#                                            time.id = "year", 
#                                            unit.id = "bvd_id_int", 
#                                            treatment = "treat_text_dol", 
#                                            #exact.match.variables = c("naics_2d_factor", "size_int"),
#                                            refinement.method = "none", 
#                                            data = df.pm.08.17, 
#                                            match.missing = TRUE, 
#                                            verbose = TRUE,
#                                            qoi = "att",
#                                            outcome.var = "h1b_deny_rate",
#                                            lead = 0, 
#                                            forbid.treatment.reversal = FALSE)
# 
# # balance
# balance.08.17.l1.text.dol <- get_covariate_balance(PM.results.08.17.l1.text.dol$att,
#                                                    data = df.pm.08.17,
#                                                    covariates = c("h1b_deny_rate", 
#                                                                   "naics_2d_factor", 
#                                                                   "size_int", 
#                                                                   "public", 
#                                                                   "sales", 
#                                                                   "n_employ"),
#                                                    plot = FALSE,
#                                                    ylim = c(-2,2))
# 
# # estimation
# PE.results.08.17.l1.text.dol <- PanelEstimate(sets = PM.results.08.17.l1.text.dol, 
#                                               data = df.pm.08.17)
# summary(PE.results.08.17.l1.text.dol)
# 
# # save results 
# save(PM.results.08.17.l1.text.dol, file = paste(MAIN_DIR, "PM-08-17-l1-text-dol.RData", sep = "/"))
# save(balance.08.17.l1.text.dol, file = paste(MAIN_DIR, "balance-08-17-l1-text-dol.RData", sep = "/"))
# save(PE.results.08.17.l1.text.dol, file = paste(MAIN_DIR, "PE-08-17-l1-text-dol.RData", sep = "/"))
# 
# 
# # 10 years: 2008 to 2017, lag = 1, exact match, treat_text_dol
# # panelmatch
# PM.results.08.17.l1.text.dol.em <- PanelMatch(lag = 1, 
#                                               time.id = "year", 
#                                               unit.id = "bvd_id_int", 
#                                               treatment = "treat_text_dol", 
#                                               exact.match.variables = c("naics_2d_factor", "size_int"),
#                                               refinement.method = "none", 
#                                               data = df.pm.08.17, 
#                                               match.missing = TRUE, 
#                                               verbose = TRUE,
#                                               qoi = "att",
#                                               outcome.var = "h1b_deny_rate",
#                                               lead = 0, 
#                                               forbid.treatment.reversal = FALSE)
# 
# # balance
# balance.08.17.l1.text.dol.em <- get_covariate_balance(PM.results.08.17.l1.text.dol.em$att,
#                                                       data = df.pm.08.17,
#                                                       covariates = c("h1b_deny_rate", 
#                                                                      "naics_2d_factor", 
#                                                                      "size_int", 
#                                                                      "public", 
#                                                                      "sales", 
#                                                                      "n_employ"),
#                                                       plot = FALSE,
#                                                       ylim = c(-2,2))
# 
# # estimation
# PE.results.08.17.l1.text.dol.em <- PanelEstimate(sets = PM.results.08.17.l1.text.dol.em, 
#                                                  data = df.pm.08.17)
# summary(PE.results.08.17.l1.text.dol.em)
# 
# # save results 
# save(PM.results.08.17.l1.text.dol.em, file = paste(MAIN_DIR, "PM-08-17-l1-text-dol-em.RData", sep = "/"))
# save(balance.08.17.l1.text.dol.em, file = paste(MAIN_DIR, "balance-08-17-l1-text-dol-em.RData", sep = "/"))
# save(PE.results.08.17.l1.text.dol.em, file = paste(MAIN_DIR, "PE-08-17-l1-text-dol-em.RData", sep = "/"))
# 
# 
# # 10 years: 2008 to 2017, lag = 1, without exact match, treat_text_dhs
# # panelmatch
# PM.results.08.17.l1.text.dhs <- PanelMatch(lag = 1, 
#                                            time.id = "year", 
#                                            unit.id = "bvd_id_int", 
#                                            treatment = "treat_text_dhs", 
#                                            #exact.match.variables = c("naics_2d_factor", "size_int"),
#                                            refinement.method = "none", 
#                                            data = df.pm.08.17, 
#                                            match.missing = TRUE, 
#                                            verbose = TRUE,
#                                            qoi = "att",
#                                            outcome.var = "h1b_deny_rate",
#                                            lead = 0, 
#                                            forbid.treatment.reversal = FALSE)
# 
# # balance
# balance.08.17.l1.text.dhs <- get_covariate_balance(PM.results.08.17.l1.text.dhs$att,
#                                                    data = df.pm.08.17,
#                                                    covariates = c("h1b_deny_rate", 
#                                                                   "naics_2d_factor", 
#                                                                   "size_int", 
#                                                                   "public", 
#                                                                   "sales", 
#                                                                   "n_employ"),
#                                                    plot = FALSE,
#                                                    ylim = c(-2,2))
# 
# # estimation
# PE.results.08.17.l1.text.dhs <- PanelEstimate(sets = PM.results.08.17.l1.text.dhs, 
#                                               data = df.pm.08.17)
# summary(PE.results.08.17.l1.text.dhs)
# 
# # save results 
# save(PM.results.08.17.l1.text.dhs, file = paste(MAIN_DIR, "PM-08-17-l1-text-dhs.RData", sep = "/"))
# save(balance.08.17.l1.text.dhs, file = paste(MAIN_DIR, "balance-08-17-l1-text-dhs.RData", sep = "/"))
# save(PE.results.08.17.l1.text.dhs, file = paste(MAIN_DIR, "PE-08-17-l1-text-dhs.RData", sep = "/"))
# 
# 
# # 10 years: 2008 to 2017, lag = 1, exact match, treat_text_dhs
# # panelmatch
# PM.results.08.17.l1.text.dhs.em <- PanelMatch(lag = 1, 
#                                               time.id = "year", 
#                                               unit.id = "bvd_id_int", 
#                                               treatment = "treat_text_dhs", 
#                                               exact.match.variables = c("naics_2d_factor", "size_int"),
#                                               refinement.method = "none", 
#                                               data = df.pm.08.17, 
#                                               match.missing = TRUE, 
#                                               verbose = TRUE,
#                                               qoi = "att",
#                                               outcome.var = "h1b_deny_rate",
#                                               lead = 0, 
#                                               forbid.treatment.reversal = FALSE)
# 
# # balance
# balance.08.17.l1.text.dhs.em <- get_covariate_balance(PM.results.08.17.l1.text.dhs.em$att,
#                                                       data = df.pm.08.17,
#                                                       covariates = c("h1b_deny_rate", 
#                                                                      "naics_2d_factor", 
#                                                                      "size_int", 
#                                                                      "public", 
#                                                                      "sales", 
#                                                                      "n_employ"),
#                                                       plot = FALSE,
#                                                       ylim = c(-2,2))
# 
# # estimation
# PE.results.08.17.l1.text.dhs.em <- PanelEstimate(sets = PM.results.08.17.l1.text.dhs.em, 
#                                                  data = df.pm.08.17)
# summary(PE.results.08.17.l1.text.dhs.em)
# 
# # save results 
# save(PM.results.08.17.l1.text.dhs.em, file = paste(MAIN_DIR, "PM-08-17-l1-text-dhs-em.RData", sep = "/"))
# save(balance.08.17.l1.text.dhs.em, file = paste(MAIN_DIR, "balance-08-17-l1-text-dhs-em.RData", sep = "/"))
# save(PE.results.08.17.l1.text.dhs.em, file = paste(MAIN_DIR, "PE-08-17-l1-text-dhs-em.RData", sep = "/"))
# 
# 
# # 10 years: 2008 to 2017, lag = 1, without exact match, treat_text_wh_eop
# # panelmatch
# PM.results.08.17.l1.text.wh.eop <- PanelMatch(lag = 1, 
#                                               time.id = "year", 
#                                               unit.id = "bvd_id_int", 
#                                               treatment = "treat_text_wh_eop", 
#                                               #exact.match.variables = c("naics_2d_factor", "size_int"),
#                                               refinement.method = "none", 
#                                               data = df.pm.08.17, 
#                                               match.missing = TRUE, 
#                                               verbose = TRUE,
#                                               qoi = "att",
#                                               outcome.var = "h1b_deny_rate",
#                                               lead = 0, 
#                                               forbid.treatment.reversal = FALSE)
# 
# # balance
# balance.08.17.l1.text.wh.eop <- get_covariate_balance(PM.results.08.17.l1.text.wh.eop$att,
#                                                       data = df.pm.08.17,
#                                                       covariates = c("h1b_deny_rate", 
#                                                                      "naics_2d_factor", 
#                                                                      "size_int", 
#                                                                      "public", 
#                                                                      "sales", 
#                                                                      "n_employ"),
#                                                       plot = FALSE,
#                                                       ylim = c(-2,2))
# 
# # estimation
# PE.results.08.17.l1.text.wh.eop <- PanelEstimate(sets = PM.results.08.17.l1.text.wh.eop, 
#                                                  data = df.pm.08.17)
# summary(PE.results.08.17.l1.text.wh.eop)
# 
# # save results 
# save(PM.results.08.17.l1.text.wh.eop, file = paste(MAIN_DIR, "PM-08-17-l1-text-wh-eop.RData", sep = "/"))
# save(balance.08.17.l1.text.wh.eop, file = paste(MAIN_DIR, "balance-08-17-l1-text-wh-eop.RData", sep = "/"))
# save(PE.results.08.17.l1.text.wh.eop, file = paste(MAIN_DIR, "PE-08-17-l1-text-wh-eop.RData", sep = "/"))
# 
# 
# # 10 years: 2008 to 2017, lag = 1, exact match, treat_text_wh_eop
# # panelmatch
# PM.results.08.17.l1.text.wh.eop.em <- PanelMatch(lag = 1, 
#                                                  time.id = "year", 
#                                                  unit.id = "bvd_id_int", 
#                                                  treatment = "treat_text_wh_eop", 
#                                                  exact.match.variables = c("naics_2d_factor", "size_int"),
#                                                  refinement.method = "none", 
#                                                  data = df.pm.08.17, 
#                                                  match.missing = TRUE, 
#                                                  verbose = TRUE,
#                                                  qoi = "att",
#                                                  outcome.var = "h1b_deny_rate",
#                                                  lead = 0, 
#                                                  forbid.treatment.reversal = FALSE)
# # balance
# balance.08.17.l1.text.wh.eop.em <- get_covariate_balance(PM.results.08.17.l1.text.wh.eop.em$att,
#                                                          data = df.pm.08.17,
#                                                          covariates = c("h1b_deny_rate", 
#                                                                         "naics_2d_factor", 
#                                                                         "size_int", 
#                                                                         "public", 
#                                                                         "sales", 
#                                                                         "n_employ"),
#                                                          plot = FALSE,
#                                                          ylim = c(-2,2))
# 
# # estimation
# PE.results.08.17.l1.text.wh.eop.em <- PanelEstimate(sets = PM.results.08.17.l1.text.wh.eop.em, 
#                                                     data = df.pm.08.17)
# summary(PE.results.08.17.l1.text.wh.eop.em)
# 
# # save results 
# save(PM.results.08.17.l1.text.wh.eop.em, file = paste(MAIN_DIR, "PM-08-17-l1-text-wh-eop-em.RData", sep = "/"))
# save(balance.08.17.l1.text.wh.eop.em, file = paste(MAIN_DIR, "balance-08-17-l1-text-wh-eop-em.RData", sep = "/"))
# save(PE.results.08.17.l1.text.wh.eop.em, file = paste(MAIN_DIR, "PE-08-17-l1-text-wh-eop-em.RData", sep = "/"))
# 
# 
# # 10 years: 2008 to 2017, lag = 1, without exact match, treat_text_only_congress
# # panelmatch
# PM.results.08.17.l1.text.only.congress <- PanelMatch(lag = 1, 
#                                                      time.id = "year", 
#                                                      unit.id = "bvd_id_int", 
#                                                      treatment = "treat_text_only_congress", 
#                                                      #exact.match.variables = c("naics_2d_factor", "size_int"),
#                                                      refinement.method = "none", 
#                                                      data = df.pm.08.17, 
#                                                      match.missing = TRUE, 
#                                                      verbose = TRUE,
#                                                      qoi = "att",
#                                                      outcome.var = "h1b_deny_rate",
#                                                      lead = 0, 
#                                                      forbid.treatment.reversal = FALSE)
# 
# # balance
# balance.08.17.l1.text.only.congress <- get_covariate_balance(PM.results.08.17.l1.text.only.congress$att,
#                                                              data = df.pm.08.17,
#                                                              covariates = c("h1b_deny_rate", 
#                                                                             "naics_2d_factor", 
#                                                                             "size_int", 
#                                                                             "public", 
#                                                                             "sales", 
#                                                                             "n_employ"),
#                                                              plot = FALSE,
#                                                              ylim = c(-2,2))
# 
# # estimation
# PE.results.08.17.l1.text.only.congress <- PanelEstimate(sets = PM.results.08.17.l1.text.only.congress, 
#                                                         data = df.pm.08.17)
# summary(PE.results.08.17.l1.text.only.congress)
# 
# # save results 
# save(PM.results.08.17.l1.text.only.congress, file = paste(MAIN_DIR, "PM-08-17-l1-text-only-congress.RData", sep = "/"))
# save(balance.08.17.l1.text.only.congress, file = paste(MAIN_DIR, "balance-08-17-l1-text-only-congress.RData", sep = "/"))
# save(PE.results.08.17.l1.text.only.congress, file = paste(MAIN_DIR, "PE-08-17-l1-text-only-congress.RData", sep = "/"))
# 
# 
# # 10 years: 2008 to 2017, lag = 1, exact match, treat_text_only_congress
# # panelmatch
# PM.results.08.17.l1.text.only.congress.em <- PanelMatch(lag = 1, 
#                                                         time.id = "year", 
#                                                         unit.id = "bvd_id_int", 
#                                                         treatment = "treat_text_only_congress", 
#                                                         exact.match.variables = c("naics_2d_factor", "size_int"),
#                                                         refinement.method = "none", 
#                                                         data = df.pm.08.17, 
#                                                         match.missing = TRUE, 
#                                                         verbose = TRUE,
#                                                         qoi = "att",
#                                                         outcome.var = "h1b_deny_rate",
#                                                         lead = 0, 
#                                                         forbid.treatment.reversal = FALSE)
# 
# # balance
# balance.08.17.l1.text.only.congress.em <- get_covariate_balance(PM.results.08.17.l1.text.only.congress.em$att,
#                                                                 data = df.pm.08.17,
#                                                                 covariates = c("h1b_deny_rate", 
#                                                                                "naics_2d_factor", 
#                                                                                "size_int", 
#                                                                                "public", 
#                                                                                #"n_subsidiary", 
#                                                                                "sales", 
#                                                                                "n_employ"
#                                                                 ),
#                                                                 plot = FALSE,
#                                                                 ylim = c(-2,2))
# 
# # estimation
# PE.results.08.17.l1.text.only.congress.em <- PanelEstimate(sets = PM.results.08.17.l1.text.only.congress.em, 
#                                                            data = df.pm.08.17)
# summary(PE.results.08.17.l1.text.only.congress.em)
# 
# # save results 
# save(PM.results.08.17.l1.text.only.congress.em, file = paste(MAIN_DIR, "PM-08-17-l1-text-only-congress-em.RData", sep = "/"))
# save(balance.08.17.l1.text.only.congress.em, file = paste(MAIN_DIR, "balance-08-17-l1-text-only-congress-em.RData", sep = "/"))
# save(PE.results.08.17.l1.text.only.congress.em, file = paste(MAIN_DIR, "PE-08-17-l1-text-only-congress-em.RData", sep = "/"))


  
## load anonymous resulting output data for replication ------------------------
# load saved results
load(file = paste(MAIN_DIR, "PM-08-17-l1.RData", sep = "/"))
load(file = paste(MAIN_DIR, "balance-08-17-l1.RData", sep = "/"))
load(file = paste(MAIN_DIR, "PE-08-17-l1.RData", sep = "/"))

load(file = paste(MAIN_DIR, "PM-08-17-l1-em.RData", sep = "/"))
load(file = paste(MAIN_DIR, "balance-08-17-l1-em.RData", sep = "/"))
load(file = paste(MAIN_DIR, "PE-08-17-l1-em.RData", sep = "/"))

load(file = paste(MAIN_DIR, "PM-08-17-l1-text.RData", sep = "/"))
load(file = paste(MAIN_DIR, "balance-08-17-l1-text.RData", sep = "/"))
load(file = paste(MAIN_DIR, "PE-08-17-l1-text.RData", sep = "/"))

load(file = paste(MAIN_DIR, "PM-08-17-l1-text-em.RData", sep = "/"))
load(file = paste(MAIN_DIR, "balance-08-17-l1-text-em.RData", sep = "/"))
load(file = paste(MAIN_DIR, "PE-08-17-l1-text-em.RData", sep = "/"))

load(file = paste(MAIN_DIR, "PM-08-17-l1-text-uscis.RData", sep = "/"))
load(file = paste(MAIN_DIR, "balance-08-17-l1-text-uscis.RData", sep = "/"))
load(file = paste(MAIN_DIR, "PE-08-17-l1-text-uscis.RData", sep = "/"))

load(file = paste(MAIN_DIR, "PM-08-17-l1-text-uscis-em.RData", sep = "/"))
load(file = paste(MAIN_DIR, "balance-08-17-l1-text-uscis-em.RData", sep = "/"))
load(file = paste(MAIN_DIR, "PE-08-17-l1-text-uscis-em.RData", sep = "/"))

load(file = paste(MAIN_DIR, "PM-08-17-l1-text-dol.RData", sep = "/"))
load(file = paste(MAIN_DIR, "balance-08-17-l1-text-dol.RData", sep = "/"))
load(file = paste(MAIN_DIR, "PE-08-17-l1-text-dol.RData", sep = "/"))

load(file = paste(MAIN_DIR, "PM-08-17-l1-text-dol-em.RData", sep = "/"))
load(file = paste(MAIN_DIR, "balance-08-17-l1-text-dol-em.RData", sep = "/"))
load(file = paste(MAIN_DIR, "PE-08-17-l1-text-dol-em.RData", sep = "/"))

load(file = paste(MAIN_DIR, "PM-08-17-l1-text-dhs.RData", sep = "/"))
load(file = paste(MAIN_DIR, "balance-08-17-l1-text-dhs.RData", sep = "/"))
load(file = paste(MAIN_DIR, "PE-08-17-l1-text-dhs.RData", sep = "/"))

load(file = paste(MAIN_DIR, "PM-08-17-l1-text-dhs-em.RData", sep = "/"))
load(file = paste(MAIN_DIR, "balance-08-17-l1-text-dhs-em.RData", sep = "/"))
load(file = paste(MAIN_DIR, "PE-08-17-l1-text-dhs-em.RData", sep = "/"))

load(file = paste(MAIN_DIR, "PM-08-17-l1-text-wh-eop.RData", sep = "/"))
load(file = paste(MAIN_DIR, "balance-08-17-l1-text-wh-eop.RData", sep = "/"))
load(file = paste(MAIN_DIR, "PE-08-17-l1-text-wh-eop.RData", sep = "/"))

load(file = paste(MAIN_DIR, "PM-08-17-l1-text-wh-eop-em.RData", sep = "/"))
load(file = paste(MAIN_DIR, "balance-08-17-l1-text-wh-eop-em.RData", sep = "/"))
load(file = paste(MAIN_DIR, "PE-08-17-l1-text-wh-eop-em.RData", sep = "/"))

load(file = paste(MAIN_DIR, "PM-08-17-l1-text-only-congress.RData", sep = "/"))
load(file = paste(MAIN_DIR, "balance-08-17-l1-text-only-congress.RData", sep = "/"))
load(file = paste(MAIN_DIR, "PE-08-17-l1-text-only-congress.RData", sep = "/"))

load(file = paste(MAIN_DIR, "PM-08-17-l1-text-only-congress-em.RData", sep = "/"))
load(file = paste(MAIN_DIR, "balance-08-17-l1-text-only-congress-em.RData", sep = "/"))
load(file = paste(MAIN_DIR, "PE-08-17-l1-text-only-congress-em.RData", sep = "/"))


## Table D.9: DiD Estimation with Matching: 2016–2017 --------------------------
# collect results from base approach
coef.vec.l1.base <- tibble(model = c("Any-Base",
                                     "Text-Base",
                                     "Text-Congress-Base",
                                     "Text-USCIS-Base",
                                     "Text-DOL-Base",
                                     "Text-DHS-Base",
                                     "Text-WH-EOP-Base"),
                           Treatment = c("2017 IMM Lobbying (any) x Trump Administration (2017)",
                                         "2017 IMM Lobbying (``Skilled''/``H-1B''/``Visa'') x Trump Administration (2017)",
                                         "2017 IMM Lobbying (``Skilled''/``H-1B''/``Visa'' \\& targets Only Congress) x Trump Administration (2017)",
                                         "2017 IMM Lobbying (``Skilled''/``H-1B''/``Visa'' \\& targets USCIS) x Trump Administration (2017)",
                                         "2017 IMM Lobbying (``Skilled''/``H-1B''/``Visa'' \\& targets DOL) x Trump Administration (2017)",
                                         "2017 IMM Lobbying (``Skilled''/``H-1B''/``Visa'' \\& targets DHS) x Trump Administration (2017)",
                                         "2017 IMM Lobbying (``Skilled''/``H-1B''/``Visa'' \\& targets WH/EOP) x Trump Administration (2017)"),
                           coef = c(round(PE.results.08.17.l1$estimates, 3),
                                    round(PE.results.08.17.l1.text$estimates, 3),
                                    round(PE.results.08.17.l1.text.only.congress$estimates, 3),
                                    round(PE.results.08.17.l1.text.uscis$estimates, 3),
                                    round(PE.results.08.17.l1.text.dol$estimates, 3),
                                    round(PE.results.08.17.l1.text.dhs$estimates, 3),
                                    round(PE.results.08.17.l1.text.wh.eop$estimates, 3)),
                           lwr = c(round(quantile(PE.results.08.17.l1$bootstrapped.estimates, 0.025), 3),
                                   round(quantile(PE.results.08.17.l1.text$bootstrapped.estimates, 0.025), 3),
                                   round(quantile(PE.results.08.17.l1.text.only.congress$bootstrapped.estimates, 0.025), 3),
                                   round(quantile(PE.results.08.17.l1.text.uscis$bootstrapped.estimates, 0.025), 3),
                                   round(quantile(PE.results.08.17.l1.text.dol$bootstrapped.estimates, 0.025), 3),
                                   round(quantile(PE.results.08.17.l1.text.dhs$bootstrapped.estimates, 0.025), 3),
                                   round(quantile(PE.results.08.17.l1.text.wh.eop$bootstrapped.estimates, 0.025), 3)),
                           upr = c(round(quantile(PE.results.08.17.l1$bootstrapped.estimates, 0.975), 3), 
                                   round(quantile(PE.results.08.17.l1.text$bootstrapped.estimates, 0.975), 3), 
                                   round(quantile(PE.results.08.17.l1.text.only.congress$bootstrapped.estimates, 0.975), 3), 
                                   round(quantile(PE.results.08.17.l1.text.uscis$bootstrapped.estimates, 0.975), 3), 
                                   round(quantile(PE.results.08.17.l1.text.dol$bootstrapped.estimates, 0.975), 3), 
                                   round(quantile(PE.results.08.17.l1.text.dhs$bootstrapped.estimates, 0.975), 3), 
                                   round(quantile(PE.results.08.17.l1.text.wh.eop$bootstrapped.estimates, 0.975), 3)),
                           `95% C.I.` = paste("(", lwr, ", ", upr, ")", sep = ""))

# format results for table
coef.vec.l1.base.l <- coef.vec.l1.base %>%
  mutate(coef = as.character(coef)) %>%
  pivot_longer(cols = c("coef", `95% C.I.`),
               names_to = "qi",
               values_to = "Base") %>%
  select(Treatment, qi, `Base`)


# collect results from exact matching
coef.vec.l1.match <- tibble(model = c("Any-Match",
                                      "Text-Match",
                                      "Text-Congress-Match",
                                      "Text-USCIS-Match",
                                      "Text-DOL-Match",
                                      "Text-DHS-Match",
                                      "Text-WH-EOP-Match"),
                            Treatment = c("2017 IMM Lobbying (any) x Trump Administration (2017)",
                                          "2017 IMM Lobbying (``Skilled''/``H-1B''/``Visa'') x Trump Administration (2017)",
                                          "2017 IMM Lobbying (``Skilled''/``H-1B''/``Visa'' \\& targets Only Congress) x Trump Administration (2017)",
                                          "2017 IMM Lobbying (``Skilled''/``H-1B''/``Visa'' \\& targets USCIS) x Trump Administration (2017)",
                                          "2017 IMM Lobbying (``Skilled''/``H-1B''/``Visa'' \\& targets DOL) x Trump Administration (2017)",
                                          "2017 IMM Lobbying (``Skilled''/``H-1B''/``Visa'' \\& targets DHS) x Trump Administration (2017)",
                                          "2017 IMM Lobbying (``Skilled''/``H-1B''/``Visa'' \\& targets WH/EOP) x Trump Administration (2017)"),
                            coef = c(round(PE.results.08.17.l1.em$estimates, 3),
                                     round(PE.results.08.17.l1.text.em$estimates, 3),
                                     round(PE.results.08.17.l1.text.only.congress.em$estimates, 3),
                                     round(PE.results.08.17.l1.text.uscis.em$estimates, 3),
                                     round(PE.results.08.17.l1.text.dol.em$estimates, 3),
                                     round(PE.results.08.17.l1.text.dhs.em$estimates, 3),
                                     round(PE.results.08.17.l1.text.wh.eop.em$estimates, 3)),
                            lwr = c(round(quantile(PE.results.08.17.l1.em$bootstrapped.estimates, 0.025), 3),
                                    round(quantile(PE.results.08.17.l1.text.em$bootstrapped.estimates, 0.025), 3),
                                    round(quantile(PE.results.08.17.l1.text.only.congress.em$bootstrapped.estimates, 0.025), 3),
                                    round(quantile(PE.results.08.17.l1.text.uscis.em$bootstrapped.estimates, 0.025), 3),
                                    round(quantile(PE.results.08.17.l1.text.dol.em$bootstrapped.estimates, 0.025), 3),
                                    round(quantile(PE.results.08.17.l1.text.dhs.em$bootstrapped.estimates, 0.025), 3),
                                    round(quantile(PE.results.08.17.l1.text.wh.eop.em$bootstrapped.estimates, 0.025), 3)),
                            upr = c(round(quantile(PE.results.08.17.l1.em$bootstrapped.estimates, 0.975), 3), 
                                    round(quantile(PE.results.08.17.l1.text.em$bootstrapped.estimates, 0.975), 3), 
                                    round(quantile(PE.results.08.17.l1.text.only.congress.em$bootstrapped.estimates, 0.975), 3), 
                                    round(quantile(PE.results.08.17.l1.text.uscis.em$bootstrapped.estimates, 0.975), 3), 
                                    round(quantile(PE.results.08.17.l1.text.dol.em$bootstrapped.estimates, 0.975), 3), 
                                    format(round(quantile(PE.results.08.17.l1.text.dhs.em$bootstrapped.estimates, 0.975), 4), scientific = FALSE, digits = 4), 
                                    round(quantile(PE.results.08.17.l1.text.wh.eop.em$bootstrapped.estimates, 0.975), 3)),
                            `95% C.I.` = paste("(", lwr, ", ", upr, ")", sep = ""))

# format results for table
coef.vec.l1.match.l <- coef.vec.l1.match %>%
  mutate(coef = as.character(coef)) %>%
  pivot_longer(cols = c("coef", `95% C.I.`),
               names_to = "qi",
               values_to = "Matching") %>%
  select(Treatment, qi, `Matching`)

# merge results
coef.l1 <- full_join(coef.vec.l1.base.l, coef.vec.l1.match.l, 
                     by = c("Treatment", "qi")) %>%
  select(-qi)

# save
print(xtable(coef.l1, 
             caption = "\\bf DiD Estimation With Matching: 2016--2017",
             label = "tb:pm",
             align = "llcc",
             type = "latex"), 
      size = "scriptsize",
      include.rownames = FALSE,
      #file = paste0(MAIN_DIR, "/Appendix-Table-D9.tex"),
      file = paste0(MAIN_DIR, "/Appendix-Table-D9.html"),
      #type = "latex",
      type = "html"
      )


## Table D.10: Improved Covariate Balance --------------------------------------
# collect results
balance.df <- data.frame(t(rbind(balance.08.17.l1,
                                 balance.08.17.l1.em,
                                 balance.08.17.l1.text,
                                 balance.08.17.l1.text.em)))

# add column names
colnames(balance.df) <- c("Any: Baseline", 
                          "Any: Matching", 
                          "Text: Baseline", 
                          "Text: Matching")

# remove row names
rownames(balance.df) <- NULL

# set variable names
balance.df$Variable <- c("H-1B Denial Rate",
                         "2-Digit NAICS Code",
                         "Size",
                         "Public",
                         "Sales",
                         "Employment")

# format results for table
balance.df <- balance.df %>%
  mutate(`Any: Baseline` = round(`Any: Baseline`, 3),
         `Any: Matching` = round(`Any: Matching`, 3),
         `Text: Baseline` = round(`Text: Baseline`, 3),
         `Text: Matching` = round(`Text: Matching`, 3)) %>%
  select(Variable, `Any: Baseline`:`Text: Matching`)

# save
print(xtable(balance.df, 
             caption = "\\bf Improved Covariate Balance",
             label = "tb:cov-bal",
             align = "llcccc",
             type = "latex",
             digits = 3), 
      size = "footnotesize",
      include.rownames = FALSE,
      #file = paste0(MAIN_DIR, "/Appendix-Table-D10.tex"),
      file = paste0(MAIN_DIR, "/Appendix-Table-D10.html"),
      #type = "latex",
      type = "html")


## Figure D.2: Frequency Distribution of the Number of Matched Control Firms ----
# collect results
match.any.em <- PM.results.08.17.l1.em$att
match.text.em <- PM.results.08.17.l1.text.em$att

# extract quantity of interest and put into data frame
distr.match.any.em <- data.frame(model = "2017 IMM Lobbying (any)",
                                 n_controls = sapply(match.any.em, length))

distr.match.text.em <- data.frame(model = "2017 IMM Lobbying (text)",
                                  n_controls = sapply(match.text.em, length))

# combine results
distr.df <- rbind(distr.match.any.em, distr.match.text.em)

# plot
axis.title.size <- 16

p.distr.em <- ggplot(distr.df, aes(x = n_controls)) + 
  facet_wrap(~ model, nrow = 1) +
  geom_histogram(binwidth = 4, 
                 fill = "black",
                 position = "identity", 
                 alpha = 1) + 
  scale_x_continuous("Number of Matched Control Firms") +
  scale_y_continuous("Frequency",
                     breaks = seq(0, 25, 5),
                     labels = seq(0, 25, 5),
                     limits = c(0, 25)) +
  theme_bw() +
  theme(plot.title = element_text(size = axis.title.size,
                                  face = "bold",
                                  margin = margin(0, 0, 10, 0),
                                  hjust = 0.5),
        axis.title.y = element_text(size = axis.title.size,
                                    margin = margin(0, 10, 0, 0)),
        axis.title.x = element_text(size = axis.title.size,
                                    margin = margin(10, 0, 0, 0)),
        axis.text = element_text(size = axis.title.size - 2),
        panel.grid.minor.x = element_blank(),
        panel.grid.minor.y = element_blank(),
        panel.grid.major.x = element_blank(),
        strip.background = element_blank(),
        strip.text = element_text(size = axis.title.size, 
                                  face = "bold"),
        legend.position = "none") 

# save
pdf(paste(MAIN_DIR, "Appendix-Figure-D2.pdf", sep = "/"), 
    width = 8.2, height = 4.1)
print(p.distr.em)
dev.off()

